import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, precision_recall_curve
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import xgboost as xgb
# Load the credit-card fraud dataset.
# NOTE(review): hard-coded absolute path — replace with a relative/configurable path before sharing.
df = pd.read_csv("/Users/motlegoland/Desktop/data analyst/projects/project portfolio/fraud detection/data/creditcard_csv.csv")
# Display the raw DataFrame (notebook cell output).
df
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0 | -1.359807 | -0.072781 | 2.536347 | 1.378155 | -0.338321 | 0.462388 | 0.239599 | 0.098698 | 0.363787 | ... | -0.018307 | 0.277838 | -0.110474 | 0.066928 | 0.128539 | -0.189115 | 0.133558 | -0.021053 | 149.62 | '0' |
| 1 | 0.0 | 1.191857 | 0.266151 | 0.166480 | 0.448154 | 0.060018 | -0.082361 | -0.078803 | 0.085102 | -0.255425 | ... | -0.225775 | -0.638672 | 0.101288 | -0.339846 | 0.167170 | 0.125895 | -0.008983 | 0.014724 | 2.69 | '0' |
| 2 | 1.0 | -1.358354 | -1.340163 | 1.773209 | 0.379780 | -0.503198 | 1.800499 | 0.791461 | 0.247676 | -1.514654 | ... | 0.247998 | 0.771679 | 0.909412 | -0.689281 | -0.327642 | -0.139097 | -0.055353 | -0.059752 | 378.66 | '0' |
| 3 | 1.0 | -0.966272 | -0.185226 | 1.792993 | -0.863291 | -0.010309 | 1.247203 | 0.237609 | 0.377436 | -1.387024 | ... | -0.108300 | 0.005274 | -0.190321 | -1.175575 | 0.647376 | -0.221929 | 0.062723 | 0.061458 | 123.50 | '0' |
| 4 | 2.0 | -1.158233 | 0.877737 | 1.548718 | 0.403034 | -0.407193 | 0.095921 | 0.592941 | -0.270533 | 0.817739 | ... | -0.009431 | 0.798278 | -0.137458 | 0.141267 | -0.206010 | 0.502292 | 0.219422 | 0.215153 | 69.99 | '0' |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 284802 | 172786.0 | -11.881118 | 10.071785 | -9.834783 | -2.066656 | -5.364473 | -2.606837 | -4.918215 | 7.305334 | 1.914428 | ... | 0.213454 | 0.111864 | 1.014480 | -0.509348 | 1.436807 | 0.250034 | 0.943651 | 0.823731 | 0.77 | '0' |
| 284803 | 172787.0 | -0.732789 | -0.055080 | 2.035030 | -0.738589 | 0.868229 | 1.058415 | 0.024330 | 0.294869 | 0.584800 | ... | 0.214205 | 0.924384 | 0.012463 | -1.016226 | -0.606624 | -0.395255 | 0.068472 | -0.053527 | 24.79 | '0' |
| 284804 | 172788.0 | 1.919565 | -0.301254 | -3.249640 | -0.557828 | 2.630515 | 3.031260 | -0.296827 | 0.708417 | 0.432454 | ... | 0.232045 | 0.578229 | -0.037501 | 0.640134 | 0.265745 | -0.087371 | 0.004455 | -0.026561 | 67.88 | '0' |
| 284805 | 172788.0 | -0.240440 | 0.530483 | 0.702510 | 0.689799 | -0.377961 | 0.623708 | -0.686180 | 0.679145 | 0.392087 | ... | 0.265245 | 0.800049 | -0.163298 | 0.123205 | -0.569159 | 0.546668 | 0.108821 | 0.104533 | 10.00 | '0' |
| 284806 | 172792.0 | -0.533413 | -0.189733 | 0.703337 | -0.506271 | -0.012546 | -0.649617 | 1.577006 | -0.414650 | 0.486180 | ... | 0.261057 | 0.643078 | 0.376777 | 0.008797 | -0.473649 | -0.818267 | -0.002415 | 0.013649 | 217.00 | '0' |
284807 rows × 31 columns
# Inspect dtypes and non-null counts; per the output below, 'Class' arrives as
# an object column (quoted strings) while everything else is float64.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 284807 entries, 0 to 284806 Data columns (total 31 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Time 284807 non-null float64 1 V1 284807 non-null float64 2 V2 284807 non-null float64 3 V3 284807 non-null float64 4 V4 284807 non-null float64 5 V5 284807 non-null float64 6 V6 284807 non-null float64 7 V7 284807 non-null float64 8 V8 284807 non-null float64 9 V9 284807 non-null float64 10 V10 284807 non-null float64 11 V11 284807 non-null float64 12 V12 284807 non-null float64 13 V13 284807 non-null float64 14 V14 284807 non-null float64 15 V15 284807 non-null float64 16 V16 284807 non-null float64 17 V17 284807 non-null float64 18 V18 284807 non-null float64 19 V19 284807 non-null float64 20 V20 284807 non-null float64 21 V21 284807 non-null float64 22 V22 284807 non-null float64 23 V23 284807 non-null float64 24 V24 284807 non-null float64 25 V25 284807 non-null float64 26 V26 284807 non-null float64 27 V27 284807 non-null float64 28 V28 284807 non-null float64 29 Amount 284807 non-null float64 30 Class 284807 non-null object dtypes: float64(30), object(1) memory usage: 67.4+ MB
No null values were detected. Next we will convert the 'Class' series to an integer and identify the proportion of fraudulent transactions compared to non-fraudulent ones.
# Strip the surrounding quotes (values stored like "'0'") and downcast to a compact int8.
df['Class'] = df['Class'].str.strip("'").astype('int8')
# Class proportions — shows the dataset is heavily imbalanced (~0.17% fraud).
df['Class'].value_counts(normalize=True)
Class 0 0.998273 1 0.001727 Name: proportion, dtype: float64
The very low proportion of fraud cases means the dataset is heavily imbalanced.
# Summary statistics; the V1-V28 PCA components are centered near 0 while
# 'Amount' is heavily right-skewed (mean ~88 vs max ~25691).
df.describe().round(2)
| Time | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V21 | V22 | V23 | V24 | V25 | V26 | V27 | V28 | Amount | Class | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | ... | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 | 284807.00 |
| mean | 94813.86 | 0.00 | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | -0.00 | 0.00 | -0.00 | ... | 0.00 | -0.00 | 0.00 | 0.00 | 0.00 | 0.00 | -0.00 | -0.00 | 88.35 | 0.00 |
| std | 47488.15 | 1.96 | 1.65 | 1.52 | 1.42 | 1.38 | 1.33 | 1.24 | 1.19 | 1.10 | ... | 0.73 | 0.73 | 0.62 | 0.61 | 0.52 | 0.48 | 0.40 | 0.33 | 250.12 | 0.04 |
| min | 0.00 | -56.41 | -72.72 | -48.33 | -5.68 | -113.74 | -26.16 | -43.56 | -73.22 | -13.43 | ... | -34.83 | -10.93 | -44.81 | -2.84 | -10.30 | -2.60 | -22.57 | -15.43 | 0.00 | 0.00 |
| 25% | 54201.50 | -0.92 | -0.60 | -0.89 | -0.85 | -0.69 | -0.77 | -0.55 | -0.21 | -0.64 | ... | -0.23 | -0.54 | -0.16 | -0.35 | -0.32 | -0.33 | -0.07 | -0.05 | 5.60 | 0.00 |
| 50% | 84692.00 | 0.02 | 0.07 | 0.18 | -0.02 | -0.05 | -0.27 | 0.04 | 0.02 | -0.05 | ... | -0.03 | 0.01 | -0.01 | 0.04 | 0.02 | -0.05 | 0.00 | 0.01 | 22.00 | 0.00 |
| 75% | 139320.50 | 1.32 | 0.80 | 1.03 | 0.74 | 0.61 | 0.40 | 0.57 | 0.33 | 0.60 | ... | 0.19 | 0.53 | 0.15 | 0.44 | 0.35 | 0.24 | 0.09 | 0.08 | 77.16 | 0.00 |
| max | 172792.00 | 2.45 | 22.06 | 9.38 | 16.88 | 34.80 | 73.30 | 120.59 | 20.01 | 15.59 | ... | 27.20 | 10.50 | 22.53 | 4.58 | 7.52 | 3.52 | 31.61 | 33.85 | 25691.16 | 1.00 |
8 rows × 31 columns
# Compare the distribution of transaction times for legitimate vs. fraudulent cases.
fraud = df['Class'] == 1
time_by_class = [df.loc[~fraud, 'Time'], df.loc[fraud, 'Time']]
labels = ['Not Fraud', 'Fraud']
fig = ff.create_distplot(time_by_class, labels, show_hist=False, show_rug=False)
fig['layout'].update(title='Credit Card Transactions Time Density Plot', xaxis=dict(title='Time [s]'))
iplot(fig, filename='dist_only')
The dips in regular transactions are probably due to low activity during the night in Europe; the fraudulent transactions have a more even distribution over time.
# Visualize pairwise feature correlations; the 'Class' row/column shows which
# PCA components relate to fraud.
fig_corr, ax_corr = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False, ax=ax_corr)
ax_corr.set_title("Correlation Heatmap")
plt.show()
Visualizing the correlations of the variables, we notice that V2, V4, V11, V12, V14 and V17 have a noticeable correlation with 'Class'.
2. logistic regression¶
We will establish a logistic regression model as a baseline.
SMOTE is used to handle the class imbalance.
# Separate the target from the features.
y = df['Class']
X = df.drop(columns='Class')
# Stratified hold-out split keeps the fraud ratio identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Oversample the minority (fraud) class on the training portion only, so the
# test set keeps the real-world imbalance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
Scaling the variables is necessary for this PCA-transformed dataset when using gradient-based algorithms such as logistic regression.
# Standardize features: the scaler is fit on the resampled training data only
# and then applied unchanged to the test set to avoid leakage.
scaler = StandardScaler().fit(X_train_resampled)
X_train_scaled = scaler.transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)
# Train the baseline logistic-regression model on the SMOTE-balanced, scaled data.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train_resampled)
# Hard class predictions and fraud-class probabilities on the (untouched) test set.
y_pred = log_reg.predict(X_test_scaled)
y_pred_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# PR-AUC is more informative than ROC-AUC under heavy class imbalance.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)
print("Precision-Recall AUC:", pr_auc)
roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC-AUC Score:", roc_auc)
Classification Report:
precision recall f1-score support
0 1.00 0.99 0.99 56864
1 0.13 0.90 0.23 98
accuracy 0.99 56962
macro avg 0.57 0.94 0.61 56962
weighted avg 1.00 0.99 0.99 56962
Confusion Matrix:
[[56299 565]
[ 10 88]]
Precision-Recall AUC: 0.7776995057198918
ROC-AUC Score: 0.9772273516187566
# Plot the precision/recall trade-off across thresholds for the baseline model.
fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.legend()
plt.show()
The logistic regression shows good recall but poor precision. The precision-recall curve shows that the best scenario for a balanced model is around 0.8 for both precision and recall.
Recall is highly important for fraud detection models because of the high cost of missing fraud cases.
Next we will compare this model with a random forest classifier.
3. random forest classifier¶
We will wrap training and evaluation in functions to make future adjustments easier.
# base model
def train_random_forest(X_train, y_train):
    """Fit and return a RandomForestClassifier balanced for class imbalance.

    Random forests are fairly robust to overfitting, so max_depth is left
    unbounded; 200 trees is a common choice for projects like this and is
    also the practical limit for the hardware used here.
    """
    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        class_weight='balanced',
        random_state=42,
    )
    model.fit(X_train, y_train)
    return model

rf = train_random_forest(X_train, y_train)
# evaluation
def evaluate_model(model, X_test, y_test):
    """Print a classification report, ROC-AUC and confusion matrix.

    Returns the predicted fraud-class probabilities so they can be reused
    for threshold tuning.
    """
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, predictions))
    print("AUC-ROC:", roc_auc_score(y_test, probabilities))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    return probabilities

y_prob = evaluate_model(rf, X_test, y_test)
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.96 0.76 0.85 98
accuracy 1.00 56962
macro avg 0.98 0.88 0.92 56962
weighted avg 1.00 1.00 1.00 56962
AUC-ROC: 0.9571890288895525
Confusion Matrix:
[[56861 3]
[ 24 74]]
# Mean transaction amount across confirmed fraud cases — a proxy for the
# cost of a single false negative.
average_fraud_amount = df[df['Class'] == 1]['Amount'].mean()
average_fraud_amount
122.21132113821139
We won't adjust the class weights in the model because of the lack of information about the cost of false positives.
The cost of a false negative can be estimated (e.g. from the average fraud amount above), so if we receive information about the false-positive cost, tuning the model would be straightforward.
def plot_precision_recall_curve(y_test, y_prob):
    """Plot precision vs. recall with a fine tick grid for reading off operating points."""
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    fig, ax = plt.subplots()
    ax.plot(recall, precision, marker='.')
    ax.set_title('Precision-Recall Curve')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.grid(which='both', alpha=1)
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.05))
    plt.show()

plot_precision_recall_curve(y_test, y_prob)
As we can see in the graph, the random forest model shows a better balance between precision and recall;
we will add a function for threshold adjustment and find the optimal operating point.
# Threshold Adjustment
def adjust_threshold(model, X_test, y_test, threshold=0.5):
    """Evaluate `model` on the test set using a custom probability threshold.

    Lowering the threshold trades precision for recall, which matters here
    because a missed fraud case (false negative) is the costly error.
    Returns the thresholded 0/1 predictions.
    """
    probabilities = model.predict_proba(X_test)[:, 1]
    # Binarize the fraud probabilities at the requested cut-off.
    predictions = np.where(probabilities >= threshold, 1, 0)
    print(f"Classification Report at Threshold {threshold}:")
    print(classification_report(y_test, predictions))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    return predictions

y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.5)
Classification Report at Threshold 0.5:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.96 0.76 0.85 98
accuracy 1.00 56962
macro avg 0.98 0.88 0.92 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56861 3]
[ 24 74]]
Before we start tuning, we check the precision-recall curve to understand the potential of threshold adjustment.
We can lower the threshold until we reach about 0.85 recall.
# Threshold 0.45: recall ticks up to 0.77 with precision unchanged (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.45)
Classification Report at Threshold 0.45:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.96 0.77 0.85 98
accuracy 1.00 56962
macro avg 0.98 0.88 0.93 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56861 3]
[ 23 75]]
# Threshold 0.40: recall 0.80 at precision 0.95 (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.4)
Classification Report at Threshold 0.4:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.95 0.80 0.87 98
accuracy 1.00 56962
macro avg 0.98 0.90 0.93 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56860 4]
[ 20 78]]
# Threshold 0.35: recall 0.82, precision still 0.95 (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.35)
Classification Report at Threshold 0.35:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.95 0.82 0.88 98
accuracy 1.00 56962
macro avg 0.98 0.91 0.94 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56860 4]
[ 18 80]]
# Threshold 0.30: recall 0.84, precision dips slightly to 0.93 (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.3)
Classification Report at Threshold 0.3:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.93 0.84 0.88 98
accuracy 1.00 56962
macro avg 0.97 0.92 0.94 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56858 6]
[ 16 82]]
# Threshold 0.25: recall reaches 0.85 at precision 0.88 (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.25)
Classification Report at Threshold 0.25:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.88 0.85 0.86 98
accuracy 1.00 56962
macro avg 0.94 0.92 0.93 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56853 11]
[ 15 83]]
# Threshold 0.20: recall stays at 0.85 while precision drops further to 0.86,
# so lowering the threshold past 0.25 only costs precision (see output below).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.2)
Classification Report at Threshold 0.2:
precision recall f1-score support
0 1.00 1.00 1.00 56864
1 0.86 0.85 0.86 98
accuracy 1.00 56962
macro avg 0.93 0.92 0.93 56962
weighted avg 1.00 1.00 1.00 56962
Confusion Matrix:
[[56851 13]
[ 15 83]]
With a threshold of 0.25 we reach a recall of 0.85 and an f1-score of 0.86, while precision stays at 0.88 — a high recall rate without an excessive loss of precision.
conclusions¶
- The threshold-adjusted random forest model is recommended for deployment, given its ability to maintain high recall while keeping precision high. This ensures effective fraud detection, minimizing financial loss and false alarms while preserving user experience.
- Future improvements: implement feature-importance analysis and time-based patterns to further enhance the model, and experiment with XGBoost to see if performance can be improved further.